Part A

CONTEXT:
The data concerns city-cycle fuel consumption in miles per gallon, to be predicted from 3 multivalued discrete and 5 continuous attributes.

In [1]:
import numpy as np   
from sklearn.linear_model import LinearRegression
from scipy import stats 
from scipy.stats import zscore
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_samples, silhouette_score
import pandas as pd    
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline 
sns.set(color_codes=True)
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
In [2]:
#Loading the JSON file and concatenating it with the car names
ca=pd.read_json(r'./Part1+-+Car-Attributes.json')
ca1=pd.read_csv('./Part1+-+Car+name.csv')
car=pd.concat([ca,ca1],axis=1)
car.head()
Out[2]:
mpg cyl disp hp wt acc yr origin car_name
0 18.0 8 307.0 130 3504 12.0 70 1 chevrolet chevelle malibu
1 15.0 8 350.0 165 3693 11.5 70 1 buick skylark 320
2 18.0 8 318.0 150 3436 11.0 70 1 plymouth satellite
3 16.0 8 304.0 150 3433 12.0 70 1 amc rebel sst
4 17.0 8 302.0 140 3449 10.5 70 1 ford torino
In [3]:
row, column = car.shape
print('The dataset contains', row, 'rows and', column, 'columns')
The dataset contains 398 rows and 9 columns
In [4]:
# save this data to csv, xlsx and json
car.to_csv('mpg.csv', index=False)
car.to_excel('mpg.xlsx', index = False)
car.to_json('mpg.json', orient='split', compression='infer', index=True)
In [5]:
#dropping/ignoring car_name 
car = car.drop('car_name', axis=1)
# Also replacing the numeric origin codes with region names
car['origin'] = car['origin'].replace({1: 'america', 2: 'europe', 3: 'asia'})
car.head()
Out[5]:
mpg cyl disp hp wt acc yr origin
0 18.0 8 307.0 130 3504 12.0 70 america
1 15.0 8 350.0 165 3693 11.5 70 america
2 18.0 8 318.0 150 3436 11.0 70 america
3 16.0 8 304.0 150 3433 12.0 70 america
4 17.0 8 302.0 140 3449 10.5 70 america
In [6]:
print('The data type of each attribute: \n')
car.info()
The data type of each attribute: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mpg     398 non-null    float64
 1   cyl     398 non-null    int64  
 2   disp    398 non-null    float64
 3   hp      398 non-null    object 
 4   wt      398 non-null    int64  
 5   acc     398 non-null    float64
 6   yr      398 non-null    int64  
 7   origin  398 non-null    object 
dtypes: float64(3), int64(3), object(2)
memory usage: 25.0+ KB
In [7]:
#5-point summary
car.describe().T
Out[7]:
count mean std min 25% 50% 75% max
mpg 398.0 23.514573 7.815984 9.0 17.500 23.0 29.000 46.6
cyl 398.0 5.454774 1.701004 3.0 4.000 4.0 8.000 8.0
disp 398.0 193.425879 104.269838 68.0 104.250 148.5 262.000 455.0
wt 398.0 2970.424623 846.841774 1613.0 2223.750 2803.5 3608.000 5140.0
acc 398.0 15.568090 2.757689 8.0 13.825 15.5 17.175 24.8
yr 398.0 76.010050 3.697627 70.0 73.000 76.0 79.000 82.0
In [8]:
# check whether every 'hp' value is a digit-only string
hpIsDigit = pd.DataFrame(car.hp.str.isdigit()) 

#show the rows where hp is not numeric
car[hpIsDigit['hp'] == False] 
Out[8]:
mpg cyl disp hp wt acc yr origin
32 25.0 4 98.0 ? 2046 19.0 71 america
126 21.0 6 200.0 ? 2875 17.0 74 america
330 40.9 4 85.0 ? 1835 17.3 80 europe
336 23.6 4 140.0 ? 2905 14.3 80 america
354 34.5 4 100.0 ? 2320 15.8 81 europe
374 23.0 4 151.0 ? 3035 20.5 82 america
In [10]:
# Replace the '?' placeholders with NaN
car = car.replace('?', np.nan)
car[hpIsDigit['hp'] == False]
Out[10]:
mpg cyl disp hp wt acc yr origin
32 25.0 4 98.0 NaN 2046 19.0 71 america
126 21.0 6 200.0 NaN 2875 17.0 74 america
330 40.9 4 85.0 NaN 1835 17.3 80 europe
336 23.6 4 140.0 NaN 2905 14.3 80 america
354 34.5 4 100.0 NaN 2320 15.8 81 europe
374 23.0 4 151.0 NaN 3035 20.5 82 america
In [11]:
#inspecting the median values that will be used to fill the missing entries
car.median()
Out[11]:
mpg       23.0
cyl        4.0
disp     148.5
hp        93.5
wt      2803.5
acc       15.5
yr        76.0
dtype: float64
In [12]:
#fill the missing hp values with the median
car['hp'].fillna((car['hp'].median()), inplace=True)
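Note (not in the original notebook): depending on the pandas version, hp may still be stored as strings at this point, since the column was read as object because of the '?' entries. A minimal sketch of an explicit conversion that could precede the fill above, assuming the same median strategy:

# convert hp to a numeric dtype; errors='coerce' turns any leftover non-numeric entries into NaN
car['hp'] = pd.to_numeric(car['hp'], errors='coerce')
car['hp'].fillna(car['hp'].median(), inplace=True)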
In [13]:
print('The data set has no missing values \n')
car.isnull().sum()
The data set has no missing values 

Out[13]:
mpg       0
cyl       0
disp      0
hp        0
wt        0
acc       0
yr        0
origin    0
dtype: int64
In [14]:
#creating an mpg_level attribute based roughly on the 25th and 75th percentiles of mpg (< 17 low, 17-29 medium, > 29 high)
car['mpg_level'] = car['mpg'].apply(lambda x: 'low' if x<17 else 'high' if x>29 else 'medium')
car.head()
Out[14]:
mpg cyl disp hp wt acc yr origin mpg_level
0 18.0 8 307.0 130.0 3504 12.0 70 america medium
1 15.0 8 350.0 165.0 3693 11.5 70 america low
2 18.0 8 318.0 150.0 3436 11.0 70 america medium
3 16.0 8 304.0 150.0 3433 12.0 70 america low
4 17.0 8 302.0 140.0 3449 10.5 70 america medium
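The cutoffs 17 and 29 above are hard-coded approximations of the 25th and 75th percentiles shown by describe() (17.5 and 29.0). A sketch of an equivalent construction that derives the cutoffs from the data instead, assuming the same three labels (the boundaries differ slightly because the original uses 17 rather than 17.5):

# derive the quartile cutoffs from the data rather than hard-coding them
q1, q3 = car['mpg'].quantile([0.25, 0.75])
car['mpg_level'] = pd.cut(car['mpg'], bins=[-np.inf, q1, q3, np.inf],
                          labels=['low', 'medium', 'high'])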
In [15]:
#categorical variables
car_cat = car.iloc[:,[1,6,7,8]]
car_cat.head()
Out[15]:
cyl yr origin mpg_level
0 8 70 america medium
1 8 70 america low
2 8 70 america medium
3 8 70 america low
4 8 70 america medium
In [17]:
#numeric variables
car_num=car.drop(['cyl','yr','origin','mpg_level'],axis=1)
car_num.head()
Out[17]:
mpg disp hp wt acc
0 18.0 307.0 130.0 3504 12.0
1 15.0 350.0 165.0 3693 11.5
2 18.0 318.0 150.0 3436 11.0
3 16.0 304.0 150.0 3433 12.0
4 17.0 302.0 140.0 3449 10.5
In [18]:
#plotting categorical variables
fig = plt.figure(1, (14, 8))

for i, col in enumerate(car_cat.columns):
    ax = plt.subplot(2,2,i+1)
    sns.countplot(car_cat[col], order=car_cat[col].value_counts().index)
    ax.set_xlabel(None)
    ax.set_title(f'Distribution of {col}')
    plt.tight_layout()

plt.show()
In [19]:
#plot histograms
car_num.hist(bins = 20, figsize = (10, 8), color = 'blue')
plt.show()
In [20]:
#plot density
plt.figure(figsize=(17, 13))
col = 1
for i in car_num.columns:
    plt.subplot(3, 3, col)
    sns.distplot(car_num[i], color = 'b')
    col += 1 
In [21]:
#joining the categorical and numerical variables
car=pd.concat([car_cat,car_num],axis=1)
In [22]:
#checking for attribute type
car.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   cyl        398 non-null    int64  
 1   yr         398 non-null    int64  
 2   origin     398 non-null    object 
 3   mpg_level  398 non-null    object 
 4   mpg        398 non-null    float64
 5   disp       398 non-null    float64
 6   hp         398 non-null    float64
 7   wt         398 non-null    int64  
 8   acc        398 non-null    float64
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB
In [23]:
#create dummy variables for mpg_level and origin, and create separate copies of the dataset for k-means and hierarchical clustering
car = pd.get_dummies(car, columns=['origin'])
car = pd.get_dummies(car, columns=['mpg_level'])
car.head()
carH=car.copy()
carK=car.copy()
In [24]:
#pair plot for the numeric attributes
car_attr = car.iloc[:, 0:7]
sns.pairplot(car_attr, diag_kind='kde');
In [25]:
#dropping the created dummy variables
car2=car.drop(['origin_america','origin_asia','origin_europe','mpg_level_high','mpg_level_low','mpg_level_medium'],axis=1)
In [26]:
#checking for outliers
plt.figure(figsize=(25, 20))
col = 1
for i in car2.columns:
    plt.subplot(3, 3, col)
    sns.boxplot(car2[i],color='blue')
    col += 1
In [27]:
#computing the IQR for hp and acc (outlier bounds: Q1 - 1.5*IQR and Q3 + 1.5*IQR)
IQR1 = stats.iqr(car2['hp'], interpolation = 'midpoint')
IQR2 = stats.iqr(car2['acc'], interpolation = 'midpoint')
In [28]:
#Horsepower after capping the upper outliers
Q3 = car2['hp'].quantile(0.75)
car2['hp'] = np.where(car2["hp"] >(Q3+1.5*IQR1), 198.5,car2['hp'])
sns.boxplot(car2['hp']);
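Assuming the hard-coded 198.5 is the Q3 + 1.5*IQR bound for hp, the cap can also be computed rather than typed in; a minimal sketch:

# compute the upper whisker for hp and cap values above it (equivalent to the hard-coded 198.5 if that is the bound)
upper_hp = car2['hp'].quantile(0.75) + 1.5 * IQR1
car2['hp'] = np.where(car2['hp'] > upper_hp, upper_hp, car2['hp'])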
In [29]:
#acceleration after capping outliers at the upper and lower bounds
Q1 = car2['acc'].quantile(0.25)
Q31=car2['acc'].quantile(0.75)
car2['acc'] = np.where(car2["acc"] >(Q31+1.5*IQR2),22.10 ,car2['acc'])
car2['acc'] = np.where(car2["acc"] <(Q1-1.5*IQR2),(Q1-1.5*IQR2),car2['acc'])
sns.boxplot(car2['acc']);
In [30]:
#checking for correlation
plt.figure(figsize=(10,8))
corr=car2.corr()
sns.heatmap(corr,annot=True);

Hierarchical Clustering

In [31]:
#separating numeric variables
cc = car.iloc[:,0:7] 
cc.head()
Out[31]:
cyl yr mpg disp hp wt acc
0 8 70 18.0 307.0 130.0 3504 12.0
1 8 70 15.0 350.0 165.0 3693 11.5
2 8 70 18.0 318.0 150.0 3436 11.0
3 8 70 16.0 304.0 150.0 3433 12.0
4 8 70 17.0 302.0 140.0 3449 10.5
In [32]:
#scaling the variables
cc_z = cc.apply(zscore)
cc_z.head()
Out[32]:
cyl yr mpg disp hp wt acc
0 1.498191 -1.627426 -0.706439 1.090604 0.673118 0.630870 -1.295498
1 1.498191 -1.627426 -1.090751 1.503514 1.589958 0.854333 -1.477038
2 1.498191 -1.627426 -0.706439 1.196232 1.197027 0.550470 -1.658577
3 1.498191 -1.627426 -0.962647 1.061796 1.197027 0.546923 -1.295498
4 1.498191 -1.627426 -0.834543 1.042591 0.935072 0.565841 -1.840117
In [33]:
#building the linkage matrix with the average linkage method
link_method = linkage(cc_z.iloc[:,0:7], method = 'average')
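As an optional sanity check not done in the original notebook, the cophenetic correlation coefficient measures how faithfully the average-linkage tree preserves the original pairwise distances (values close to 1 are better); a minimal sketch:

from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

# correlation between the tree's cophenetic distances and the raw pairwise distances
coph_corr, coph_dists = cophenet(link_method, pdist(cc_z.iloc[:, 0:7]))
print('Cophenetic correlation:', round(coph_corr, 3))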
In [34]:
#plotting the dendrogram
plt.figure(figsize=(25, 10))
dendrogram(link_method)
plt.show()

This appears too visually cluttered, so we'll truncate the dendrogram to show only the last 2 clusters/groups.
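Besides asking for an exact number of clusters, the same cut can be expressed as a distance threshold on the tree; a hedged sketch that cuts just below the final merge, which should reproduce the 2-cluster split (assuming the final merge is strictly higher than the one before it):

# cut the tree at the height of the second-to-last merge instead of fixing the cluster count
cut_height = link_method[-2, 2]
clusters_by_dist = fcluster(link_method, t=cut_height, criterion='distance')
print('Clusters at this threshold:', len(np.unique(clusters_by_dist)))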

In [35]:
# truncated dendrogram showing only the last 2 merged clusters
dendrogram(
    link_method,
    truncate_mode='lastp',  
    p=2,  
)
plt.show()
In [36]:
#viewing the clusters formed
clusters = fcluster(link_method, 2, criterion='maxclust')
clusters
Out[36]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2], dtype=int32)
In [37]:
#attaching the clusters formed to the scaled data
cc_z['clusters_H'] = clusters
cc_z.head()
Out[37]:
cyl yr mpg disp hp wt acc clusters_H
0 1.498191 -1.627426 -0.706439 1.090604 0.673118 0.630870 -1.295498 1
1 1.498191 -1.627426 -1.090751 1.503514 1.589958 0.854333 -1.477038 1
2 1.498191 -1.627426 -0.706439 1.196232 1.197027 0.550470 -1.658577 1
3 1.498191 -1.627426 -0.962647 1.061796 1.197027 0.546923 -1.295498 1
4 1.498191 -1.627426 -0.834543 1.042591 0.935072 0.565841 -1.840117 1
In [38]:
#viewing the distribution of clusters
cc_z.clusters_H.value_counts().sort_index()
Out[38]:
1    100
2    298
Name: clusters_H, dtype: int64
In [39]:
#attaching the clusters formed to the original data
cc['clusters_H']=clusters
carH['clusters_H']=clusters
cc.head()
Out[39]:
cyl yr mpg disp hp wt acc clusters_H
0 8 70 18.0 307.0 130.0 3504 12.0 1
1 8 70 15.0 350.0 165.0 3693 11.5 1
2 8 70 18.0 318.0 150.0 3436 11.0 1
3 8 70 16.0 304.0 150.0 3433 12.0 1
4 8 70 17.0 302.0 140.0 3449 10.5 1
In [40]:
#create a new data set named Hclus
Hclus=cc
Hclus.head()
Out[40]:
cyl yr mpg disp hp wt acc clusters_H
0 8 70 18.0 307.0 130.0 3504 12.0 1
1 8 70 15.0 350.0 165.0 3693 11.5 1
2 8 70 18.0 318.0 150.0 3436 11.0 1
3 8 70 16.0 304.0 150.0 3433 12.0 1
4 8 70 17.0 302.0 140.0 3449 10.5 1
In [41]:
#aggregating the numeric variables by cluster using the mean
aggdata=cc.iloc[:,0:8].groupby('clusters_H').mean()
aggdata['Freq']=cc.clusters_H.value_counts().sort_index()
aggdata
Out[41]:
cyl yr mpg disp hp wt acc Freq
clusters_H
1 7.980000 73.740000 14.684000 345.470000 160.400000 4121.560000 12.702000 100
2 4.607383 76.771812 26.477852 142.404362 85.479866 2584.137584 16.529866 298
In [42]:
#plotting the clusters formed
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="clusters_H",
              data=cc_z, 
                    palette=['green','brown']);

K-Means Clustering

In [43]:
#separating the numeric values and scaling them
cc = car.iloc[:,0:7] 
cc_z1 = cc.apply(zscore)
cc_z1.head()
Out[43]:
cyl yr mpg disp hp wt acc
0 1.498191 -1.627426 -0.706439 1.090604 0.673118 0.630870 -1.295498
1 1.498191 -1.627426 -1.090751 1.503514 1.589958 0.854333 -1.477038
2 1.498191 -1.627426 -0.706439 1.196232 1.197027 0.550470 -1.658577
3 1.498191 -1.627426 -0.962647 1.061796 1.197027 0.546923 -1.295498
4 1.498191 -1.627426 -0.834543 1.042591 0.935072 0.565841 -1.840117
In [44]:
#calculating the within-cluster sum of squares (WSS)
wss =[] 
for i in range(1,5):
    KM = KMeans(n_clusters=i)
    KM.fit(cc_z1)
    wss.append(KM.inertia_)
wss
Out[44]:
[2785.9999999999995, 1294.841895072732, 946.0197908553794, 738.3743876111234]
In [45]:
#plotting WSS against the number of clusters to find the optimal k with the elbow method
plt.plot(range(1,5), wss);
plt.title('Elbow Method');
plt.xlabel("Number of Clusters")
plt.ylabel("WSS");
In [46]:
#using 2 centroids for clustering
k_means = KMeans(n_clusters = 2)
k_means.fit(cc_z1)
labels = k_means.labels_
In [47]:
# Calculating silhouette_score
silhouette_score(cc_z1,labels)
Out[47]:
0.48235946103916116
In [48]:
#calculating the silhouette score for different numbers of clusters
kmeans_kwargs = {
   "init": "random",
   "n_init": 10,
   "max_iter": 300,
   "random_state": 42,
}


silhouette_coefficients = []

# Note: the silhouette coefficient needs at least 2 clusters, so k starts at 2
for k in range(2, 7):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(cc_z1)
    score = silhouette_score(cc_z1,kmeans.labels_)
    silhouette_coefficients.append(score)
In [49]:
#plotting the silhouette score for different numbers of clusters
plt.plot(range(2, 7), silhouette_coefficients)
plt.xticks(range(2, 7))
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()
In [50]:
#attaching the labels to the datasets
cc["cluster_K"] = labels
carK['cluster_K']=labels
Kclus=cc
Kclus.head()
Out[50]:
cyl yr mpg disp hp wt acc cluster_K
0 8 70 18.0 307.0 130.0 3504 12.0 1
1 8 70 15.0 350.0 165.0 3693 11.5 1
2 8 70 18.0 318.0 150.0 3436 11.0 1
3 8 70 16.0 304.0 150.0 3433 12.0 1
4 8 70 17.0 302.0 140.0 3449 10.5 1
In [51]:
#viewing the distribution of clusters
cc.cluster_K.value_counts().sort_index()
Out[51]:
0    293
1    105
Name: cluster_K, dtype: int64
In [52]:
#attaching the clusters to the scaled data
cc_z1["cluster_K"] = labels
cc_z1.head()
Out[52]:
cyl yr mpg disp hp wt acc cluster_K
0 1.498191 -1.627426 -0.706439 1.090604 0.673118 0.630870 -1.295498 1
1 1.498191 -1.627426 -1.090751 1.503514 1.589958 0.854333 -1.477038 1
2 1.498191 -1.627426 -0.706439 1.196232 1.197027 0.550470 -1.658577 1
3 1.498191 -1.627426 -0.962647 1.061796 1.197027 0.546923 -1.295498 1
4 1.498191 -1.627426 -0.834543 1.042591 0.935072 0.565841 -1.840117 1
In [53]:
#aggregating the numeric variables by cluster using the mean
aggdata=cc.iloc[:,0:8].groupby('cluster_K').mean()
aggdata['Freq']=cc.cluster_K.value_counts().sort_index()
aggdata
Out[53]:
cyl yr mpg disp hp wt acc Freq
cluster_K
0 4.569966 76.822526 26.619113 140.250853 85.061433 2567.860068 16.535836 293
1 7.923810 73.742857 14.851429 341.809524 158.000000 4093.771429 12.867619 105

The aggregation clearly shows two distinct groups, with a marked difference between the clusters in the averages of the variables.

In [54]:
#plotting the clusters
plt.figure(figsize=(10, 8))
sns.scatterplot(x="mpg", y="hp", hue="cluster_K",
              data=cc_z1, 
                    palette=['green','brown']);
In [55]:
carH.clusters_H.value_counts().sort_index()
Out[55]:
1    100
2    298
Name: clusters_H, dtype: int64
In [56]:
carK.cluster_K.value_counts().sort_index()
Out[56]:
0    293
1    105
Name: cluster_K, dtype: int64
In [57]:
carH.shape
Out[57]:
(398, 14)
In [58]:
carK.shape
Out[58]:
(398, 14)
In [59]:
car.head()
Out[59]:
cyl yr mpg disp hp wt acc origin_america origin_asia origin_europe mpg_level_high mpg_level_low mpg_level_medium
0 8 70 18.0 307.0 130.0 3504 12.0 1 0 0 0 0 1
1 8 70 15.0 350.0 165.0 3693 11.5 1 0 0 0 1 0
2 8 70 18.0 318.0 150.0 3436 11.0 1 0 0 0 0 1
3 8 70 16.0 304.0 150.0 3433 12.0 1 0 0 0 1 0
4 8 70 17.0 302.0 140.0 3449 10.5 1 0 0 0 0 1

Linear regression on the original dataset

In [60]:
X = car.drop(['mpg','origin_europe','mpg_level_low'], axis=1)
# the dependent variable
y = car[['mpg']]
In [61]:
# Split X and y into training and test set in 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
In [62]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[62]:
LinearRegression()
In [63]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -0.5134441386218828
The coefficient for yr is 0.4434650429116842
The coefficient for disp is 0.010688858394646887
The coefficient for hp is 0.010315514536314008
The coefficient for wt is -0.004538788568737129
The coefficient for acc is 0.19183425608862537
The coefficient for origin_america is -1.7306209513688993
The coefficient for origin_asia is -0.8976724344009405
The coefficient for mpg_level_high is 8.552374663817027
The coefficient for mpg_level_medium is 1.5941218694850492
In [64]:
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is -1.6635717568652169
In [65]:
regression_model.score(X_train, y_train)
Out[65]:
0.8967703023839786
In [66]:
O=regression_model.score(X_test, y_test)
O
Out[66]:
0.9037421476349174

Linear regression on data with K means cluster

In [67]:
#renaming the cluster labels to light and heavy vehicles and creating dummy variables from them
carK['cluster_K']=carK['cluster_K'].astype('category')
carK['cluster_K'] = carK['cluster_K'].replace({1: 'heavy', 0: 'light'})
carK = pd.get_dummies(carK, columns=['cluster_K'])
In [68]:
carK.head()
Out[68]:
cyl yr mpg disp hp wt acc origin_america origin_asia origin_europe mpg_level_high mpg_level_low mpg_level_medium cluster_K_light cluster_K_heavy
0 8 70 18.0 307.0 130.0 3504 12.0 1 0 0 0 0 1 0 1
1 8 70 15.0 350.0 165.0 3693 11.5 1 0 0 0 1 0 0 1
2 8 70 18.0 318.0 150.0 3436 11.0 1 0 0 0 0 1 0 1
3 8 70 16.0 304.0 150.0 3433 12.0 1 0 0 0 1 0 0 1
4 8 70 17.0 302.0 140.0 3449 10.5 1 0 0 0 0 1 0 1
In [69]:
X = carK.drop(['mpg','origin_europe','mpg_level_low','cluster_K_light'], axis=1)
# the dependent variable
y = carK[['mpg']]
In [70]:
# Split X and y into training and test set in 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=12)
In [71]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[71]:
LinearRegression()
In [72]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -1.1945995644777805
The coefficient for yr is 0.43186510415059826
The coefficient for disp is 0.017477496279110098
The coefficient for hp is -0.010138045835905891
The coefficient for wt is -0.0040684301693864056
The coefficient for acc is 0.1856482874624993
The coefficient for origin_america is -1.6918315494304086
The coefficient for origin_asia is -0.7407779192303001
The coefficient for mpg_level_high is 9.28312093915688
The coefficient for mpg_level_medium is 2.25000171423125
The coefficient for cluster_K_heavy is 2.511514014338475
In [73]:
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is 1.2041468067173469
In [74]:
regression_model.score(X_train, y_train)
Out[74]:
0.8942370456543635
In [75]:
K=regression_model.score(X_test, y_test)
K
Out[75]:
0.9117893808052382

Linear regression on data with H-clusters

In [76]:
#renaming the cluster labels to light and heavy vehicles and creating dummy variables from them
carH['clusters_H']=carH['clusters_H'].astype('category')
carH['clusters_H'] = carH['clusters_H'].replace({1: 'heavy', 2: 'light'})
carH = pd.get_dummies(carH, columns=['clusters_H'])
In [77]:
X = carH.drop(['mpg','origin_europe','mpg_level_low','clusters_H_light'], axis=1)
# the dependent variable
y = carH[['mpg']]
In [78]:
# Split X and y into training and test set in 70:30 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
In [79]:
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
Out[79]:
LinearRegression()
In [80]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
The coefficient for cyl is -1.0104832432576671
The coefficient for yr is 0.4475417357550161
The coefficient for disp is 0.015115200524614403
The coefficient for hp is -0.013301584387234493
The coefficient for wt is -0.00426417978067245
The coefficient for acc is 0.11805139164484575
The coefficient for origin_america is -2.1174569315391154
The coefficient for origin_asia is -1.3974915348558108
The coefficient for mpg_level_high is 8.565948239298274
The coefficient for mpg_level_medium is 1.6577250698582813
The coefficient for clusters_H_heavy is 2.038974468807404
In [81]:
intercept = regression_model.intercept_[0]
print("The intercept for our model is {}".format(intercept))
The intercept for our model is 2.572729318233023
In [82]:
regression_model.score(X_train, y_train)
Out[82]:
0.8988409890950728
In [83]:
H=regression_model.score(X_test, y_test)
H
Out[83]:
0.9010238373846703
In [84]:
modellists = []
modellists.append(['Linear Regression on Original Data set', O*100])
modellists.append(['Linear Regression with K means clusters', K*100])
modellists.append(['Linear Regression with Hierarchical clusters', H*100])
mdl_df = pd.DataFrame(modellists, columns = ['Model','r^2 on Test'])
mdl_df
Out[84]:
Model r^2 on Test
0 Linear Regression on Original Data set 90.374215
1 Linear Regression with K means clusters 91.178938
2 Linear Regression with Hierarchical clusters 90.102384

Summary:
The model with K-means clusters explains the highest variation in the dataset, but only by about 1% compared with the other models. A larger dataset could give more clarity. Since this is a dataset of used cars, it does not tell us how many previous owners each car has had, which might be a helpful variable; the gender of the previous owners and the purpose for which the cars were used are also potentially important factors the dataset does not capture. With these additional features it may be possible to achieve higher accuracy or better explainability of the models and their variables.

PART B

CONTEXT: The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.

In [85]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics
In [88]:
#import the dataset
ve = pd.read_csv('./vehicle.csv')
ve.head()
Out[88]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
In [89]:
#checking for the dimension of the data
rows, column = ve.shape
print('The dataset contains', rows, 'rows and', column, 'columns.')
The dataset contains 846 rows and 19 columns.
In [90]:
#checking for the data type
print('The data type of each attribute: \n')
ve.info()
The data type of each attribute: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   compactness                  846 non-null    int64  
 1   circularity                  841 non-null    float64
 2   distance_circularity         842 non-null    float64
 3   radius_ratio                 840 non-null    float64
 4   pr.axis_aspect_ratio         844 non-null    float64
 5   max.length_aspect_ratio      846 non-null    int64  
 6   scatter_ratio                845 non-null    float64
 7   elongatedness                845 non-null    float64
 8   pr.axis_rectangularity       843 non-null    float64
 9   max.length_rectangularity    846 non-null    int64  
 10  scaled_variance              843 non-null    float64
 11  scaled_variance.1            844 non-null    float64
 12  scaled_radius_of_gyration    844 non-null    float64
 13  scaled_radius_of_gyration.1  842 non-null    float64
 14  skewness_about               840 non-null    float64
 15  skewness_about.1             845 non-null    float64
 16  skewness_about.2             845 non-null    float64
 17  hollows_ratio                846 non-null    int64  
 18  class                        846 non-null    object 
dtypes: float64(14), int64(4), object(1)
memory usage: 125.7+ KB
In [91]:
print('Duplicated rows: ', ve[ve.duplicated()].shape[0])
Duplicated rows:  0
In [92]:
print('Null values:\n', ve.isnull().sum())
Null values:
 compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
In [93]:
#replacing the missing values with median values
for cols in ve.columns:
    if(cols != 'class'): 
        ve[cols] = ve[cols].fillna(ve[cols].median())
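An equivalent way to do the same median fill, expressed through scikit-learn rather than a loop, is SimpleImputer; a minimal sketch under the assumption that every non-class column should be median-imputed:

from sklearn.impute import SimpleImputer

# median-impute every feature column, leaving the class label untouched
feature_cols = ve.columns.drop('class')
ve[feature_cols] = SimpleImputer(strategy='median').fit_transform(ve[feature_cols])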
In [94]:
#5-point summary
ve.describe().T
Out[94]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.00 119.0
circularity 846.0 44.823877 6.134272 33.0 40.00 44.0 49.00 59.0
distance_circularity 846.0 82.100473 15.741569 40.0 70.00 80.0 98.00 112.0
radius_ratio 846.0 168.874704 33.401356 104.0 141.00 167.0 195.00 333.0
pr.axis_aspect_ratio 846.0 61.677305 7.882188 47.0 57.00 61.0 65.00 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.00 55.0
scatter_ratio 846.0 168.887707 33.197710 112.0 147.00 157.0 198.00 265.0
elongatedness 846.0 40.936170 7.811882 26.0 33.00 43.0 46.00 61.0
pr.axis_rectangularity 846.0 20.580378 2.588558 17.0 19.00 20.0 23.00 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.00 188.0
scaled_variance 846.0 188.596927 31.360427 130.0 167.00 179.0 217.00 320.0
scaled_variance.1 846.0 439.314421 176.496341 184.0 318.25 363.5 586.75 1018.0
scaled_radius_of_gyration 846.0 174.706856 32.546277 109.0 149.00 173.5 198.00 268.0
scaled_radius_of_gyration.1 846.0 72.443262 7.468734 59.0 67.00 71.5 75.00 135.0
skewness_about 846.0 6.361702 4.903244 0.0 2.00 6.0 9.00 22.0
skewness_about.1 846.0 12.600473 8.930962 0.0 5.00 11.0 19.00 41.0
skewness_about.2 846.0 188.918440 6.152247 176.0 184.00 188.0 193.00 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.00 211.0
In [95]:
#plotting the distribution of the numerical variables
ve.hist(bins = 20, figsize = (20, 18), color = 'blue')
plt.show()
In [96]:
#Density plots
plt.figure(figsize=(20, 18))
col = 1
for i in ve.drop(columns='class').columns:
    plt.subplot(4, 5, col)
    sns.distplot(ve[i], color = 'b')
    col += 1 
In [97]:
#checking the distribution of the class variable
print(ve['class'].value_counts())
plt.title('Count of Vehicle Class column')
sns.countplot(x = 'class', data = ve);
car    429
bus    218
van    199
Name: class, dtype: int64
In [98]:
#boxplots for outliers
plt.figure(figsize=(25,23))
col = 1
for i in ve.drop(columns='class').columns:
    plt.subplot(6, 4, col)
    sns.boxplot(ve[i],color='blue')
    col += 1
In [99]:
#Boxplots against the class with other numeric variables
fig,axs = plt.subplots(4,5,figsize=(20,18))
sns.boxplot(x="class", y="compactness",data=ve,ax=axs[0,0]);
sns.boxplot(x="class", y="circularity",data=ve,ax=axs[0,1]);
sns.boxplot(x="class", y="distance_circularity",data=ve,ax=axs[0,2]);
sns.boxplot(x="class", y="radius_ratio",data=ve,ax=axs[0,3]);
sns.boxplot(x="class", y="pr.axis_aspect_ratio",data=ve,ax=axs[0,4]);
sns.boxplot(x="class", y="max.length_aspect_ratio",data=ve,ax=axs[1,0]);
sns.boxplot(x="class", y="scatter_ratio",data=ve,ax=axs[1,1]);
sns.boxplot(x="class", y="elongatedness",data=ve,ax=axs[1,2]);
sns.boxplot(x="class", y="pr.axis_rectangularity",data=ve,ax=axs[1,3]);
sns.boxplot(x="class", y="max.length_rectangularity",data=ve,ax=axs[1,4]);
sns.boxplot(x="class", y="scaled_variance",data=ve,ax=axs[2,0]);
sns.boxplot(x="class", y="scaled_variance.1",data=ve,ax=axs[2,1]);
sns.boxplot(x="class", y="scaled_radius_of_gyration",data=ve,ax=axs[2,2]);
sns.boxplot(x="class", y="scaled_radius_of_gyration.1",data=ve,ax=axs[2,3]);
sns.boxplot(x="class", y="skewness_about",data=ve,ax=axs[2,4]);
sns.boxplot(x="class", y="skewness_about.1",data=ve,ax=axs[3,0]);
sns.boxplot(x="class", y="skewness_about.2",data=ve,ax=axs[3,1]);
sns.boxplot(x="class", y="hollows_ratio",data=ve,ax=axs[3,2]);
fig.tight_layout()

There are noticeable differences between the classes in the means and medians of the numeric attributes.
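To back up that visual impression with a number, a quick one-way ANOVA per attribute (a hedged sketch, not part of the original analysis) tests whether the class means differ significantly:

# one-way ANOVA of each numeric attribute across the three vehicle classes
for col in ve.drop(columns='class').columns:
    groups = [grp[col].values for _, grp in ve.groupby('class')]
    f_stat, p_val = stats.f_oneway(*groups)
    print(f'{col}: F = {f_stat:.1f}, p = {p_val:.3g}')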

In [100]:
#find the outliers and replace them by median
for col_name in ve.drop(columns = 'class').columns:
    q1 = ve[col_name].quantile(0.25)
    q3 = ve[col_name].quantile(0.75)
    iqr = q3 - q1
    
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    
    ve.loc[(ve[col_name] < low) | (ve[col_name] > high), col_name] = ve[col_name].median()
In [101]:
#boxplot after outlier treatment
plt.figure(figsize=(25,23))
col = 1
for i in ve.drop(columns='class').columns:
    plt.subplot(6, 4, col)
    sns.boxplot(ve[i],color='blue')
    col += 1
In [102]:
#checking for correlation
plt.figure(figsize=(20,18))
corr=ve.corr()
sns.heatmap(corr,annot=True);
In [103]:
#splitting the data into features and an encoded class variable to check correlations
X = ve.loc[:, ve.columns != 'class']
y = ve['class'].astype('category').cat.codes
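Because cat.codes replaces the string labels with integers, the confusion matrices and classification reports below use labels 0-2. For string categories pandas assigns codes in sorted order, so the mapping should be bus -> 0, car -> 1, van -> 2; a one-line check:

# map each integer code back to its class name for reference
print(dict(enumerate(ve['class'].astype('category').cat.categories)))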
In [104]:
#plotting the correlation with target variable
plt.figure(figsize = (15, 8))
ax = sns.barplot(x=X.columns, y=X.corrwith(y))
plt.title('Correlation with Class column', fontsize = 20)
x=plt.setp(ax.get_xticklabels(), rotation=90)

PCA

In [105]:
#scaling the numeric variables
XScaled=X.apply(zscore)
XScaled.head()
Out[105]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
0 0.160580 0.518073 0.057177 0.300945 1.933135 0.912212 -0.207598 0.136262 -0.224342 0.758332 -0.400771 -0.337407 0.285705 -0.315806 -0.032330 0.387162 -0.312012 0.183957
1 -0.325470 -0.623732 0.120741 -0.850666 -0.740596 0.427456 -0.599423 0.520519 -0.610886 -0.344578 -0.594220 -0.618623 -0.513630 0.009122 0.624090 0.161740 0.013265 0.452977
2 1.254193 0.844303 1.519141 1.265808 0.863642 0.912212 1.148719 -1.144597 0.935290 0.689401 1.114582 1.131806 1.392477 0.171586 1.718123 -0.401818 -0.149374 0.049447
3 -0.082445 -0.623732 -0.006386 -0.290423 0.328896 0.427456 -0.750125 0.648605 -0.610886 -0.344578 -0.916635 -0.739145 -1.466683 -1.453054 -0.032330 -0.289106 1.639649 1.529056
4 -1.054545 -0.134387 -0.769150 1.141310 -0.027601 -0.057300 -0.599423 0.520519 -0.610886 -0.275646 1.694930 -0.647319 0.408680 -0.072110 0.624090 -0.176395 -1.450481 -1.699181
In [106]:
#plotting the cumulative variance explained by the principal components
pca = PCA()
X_pca_ = pca.fit_transform(XScaled)
plt.figure(figsize = (12, 8))
plt.plot((np.cumsum(pca.explained_variance_ratio_) * 100), marker = 'X')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance');
In [107]:
print(pca.explained_variance_)
[9.74940269e+00 3.35071912e+00 1.19238155e+00 1.13381916e+00
 8.83997312e-01 6.66265745e-01 3.18150910e-01 2.28179142e-01
 1.31018595e-01 7.98619108e-02 7.33979478e-02 6.46162669e-02
 4.01448646e-02 3.22758478e-02 2.93936408e-02 2.27005257e-02
 1.98136761e-02 5.16287320e-03]
In [108]:
#plotting the cumulative explained variance as a step plot
plt.figure(figsize = (12, 8))
plt.step(list(range(18)), (np.cumsum(pca.explained_variance_ratio_) * 100), where = 'mid')
plt.xlim(0, 18)
plt.xlabel('Number of Components')
plt.ylabel('Percentage of Cumulative Explained Variance')
plt.title('Vehicle Dataset Explained Variance');

We can see that the first six components explain roughly 95% of the variation, while the first five capture a little over 91%. We therefore keep six components and drop the 7th component onwards.
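The same choice can be made programmatically; a small sketch that picks the smallest number of components reaching a 95% variance target (PCA also accepts a float n_components for exactly this purpose):

# smallest k such that the first k components explain at least 95% of the variance
cum_var = np.cumsum(pca.explained_variance_ratio_)
n_keep = int(np.argmax(cum_var >= 0.95) + 1)
print('Components needed for 95% variance:', n_keep)

# equivalently, let PCA derive the count from the variance target
pca_95 = PCA(n_components=0.95)
X_pca_95 = pca_95.fit_transform(XScaled)
print('Components selected by PCA(0.95):', pca_95.n_components_)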

In [109]:
#Using 6 components and printing the eigenvectors
pca3 = PCA(n_components=6)
pca3.fit(XScaled)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(XScaled)
[[ 0.27250289  0.28725469  0.30242111  0.26971354  0.09786073  0.19520014
   0.31052393 -0.3090069   0.307287    0.27815416  0.29976509  0.30553237
   0.26323762 -0.04193594  0.03608321  0.05872048  0.03801314  0.08474   ]
 [-0.08704358  0.13162176 -0.04614301 -0.19793126 -0.25783995 -0.10804563
   0.07528535 -0.01322994  0.0875602   0.12215424  0.07726575  0.07150302
   0.21058205  0.50362158 -0.01576632 -0.09274624 -0.50162122 -0.50761211]
 [-0.03818521 -0.20114691  0.06346211  0.05628517 -0.06199275 -0.14895782
   0.10904283 -0.09085269  0.1060705  -0.21368469  0.1445998   0.11034374
  -0.20287019  0.07386402 -0.55917399  0.6706805  -0.06224071 -0.04170535]
 [ 0.13867501 -0.03805548  0.10895429 -0.25435509 -0.61276572  0.27867816
   0.00539295  0.06521486  0.03089915  0.04146747 -0.06400509 -0.00219687
  -0.08553965 -0.11539962  0.47370331  0.42842603 -0.0274096   0.09603749]
 [ 0.13710147 -0.13899555 -0.08001743  0.13374437  0.12360146 -0.63489336
   0.08555745 -0.07907344  0.08164638 -0.25111294  0.14747123  0.11010098
  -0.00521211  0.1380686   0.56655224  0.13086982  0.18051929 -0.11078807]
 [ 0.26361138 -0.07134741 -0.01690062 -0.13818366 -0.57782861 -0.289097
   0.09774711 -0.07572829  0.10540323 -0.07819622  0.13291241  0.11539822
  -0.0670574  -0.13151308 -0.31917609 -0.46840497  0.28013644  0.05944441]]
[0.54099325 0.18593103 0.06616512 0.0629155  0.04905291 0.03697101]
In [110]:
#printing the original features and the reduced features
pca_6 = PCA(n_components = 6)
X_pca = pca_6.fit_transform(XScaled)
print('Original number of features:', X.shape[1])
print('Reduced number of features:', X_pca.shape[1])
Original number of features: 18
Reduced number of features: 6
In [111]:
#viewing the first 5 observations of the pca components
pca_df = pd.DataFrame(data = X_pca)
pca_df.head()
Out[111]:
0 1 2 3 4 5
0 0.584228 -0.675673 -0.453334 -0.750656 -0.777515 -1.848809
1 -1.512180 -0.348934 -0.333436 1.268953 -0.324929 -0.118317
2 3.913448 0.234507 -1.265094 0.137224 0.915751 -0.685594
3 -1.535193 -3.044413 -0.469623 0.324317 -0.611590 0.367777
4 -0.642062 1.488882 -0.246288 -0.550939 0.471655 -1.012698
In [112]:
sns.pairplot(pca_df, diag_kind = 'kde');

SVM

In [113]:
#splitting the original data into train and test 70:30
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size = 0.3, random_state = 10)
In [114]:
rtr, ctr = X_train.shape
print('The training set contains', rtr, 'rows and', ctr, 'columns.')
The training set contains 592 rows and 18 columns.
In [115]:
rt, ct = X_test.shape
print('The test set contains', rt, 'rows and', ct, 'columns.')
The test set contains 254 rows and 18 columns.
In [116]:
#splitting the pca data into train and test 70:30
X_tr, X_te, y_tr, y_te = train_test_split(X_pca, y, test_size = 0.3, random_state = 10)
In [117]:
rtr_pca, ctr_pca = X_tr.shape
print('The PCA training set contains', rtr_pca, 'rows and', ctr_pca, 'columns.')
The PCA training set contains 592 rows and 6 columns.
In [118]:
rt_pca, ct_pca = X_te.shape
print('The PCA test set contains', rt_pca, 'rows and', ct_pca, 'columns.')
The PCA test set contains 254 rows and 6 columns.
In [119]:
# Building a Support Vector Machine on train data
svc_model = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model.fit(X_train, y_train)
Out[119]:
SVC(C=4)
In [120]:
#predicting on train data
sv_train_predict = svc_model.predict(X_train)
print("Model Accuracy on train: {0:.4f}".format(metrics.accuracy_score(y_train, sv_train_predict)))
print()
Model Accuracy on train: 0.9899

In [121]:
#predicting on test data
sv_test_predict = svc_model.predict(X_test)
print("Model Accuracy on test: {0:.4f}".format(metrics.accuracy_score(y_test, sv_test_predict)))
print()
Model Accuracy on test: 0.9685

In [122]:
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(y_test, sv_test_predict)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
In [123]:
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_test, sv_test_predict, labels=[0,1,2]))
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      0.99        71
           1       0.98      0.96      0.97       125
           2       0.90      0.97      0.93        58

    accuracy                           0.97       254
   macro avg       0.96      0.97      0.97       254
weighted avg       0.97      0.97      0.97       254

In [124]:
precision_SV, recall_SV, f1_score_SV, support = precision_recall_fscore_support(y_test, sv_test_predict,average='macro')
print('Precision Score :', '%0.2f' % precision_SV)
print('Recall Score :', '%0.2f' % recall_SV)
print('F1-Score:', '%0.2f' % f1_score_SV)
SV_Acc= accuracy_score(y_test, sv_test_predict)
print('Accuracy Score :','%0.2f' % SV_Acc)
Precision Score : 0.96
Recall Score : 0.97
F1-Score: 0.97
Accuracy Score : 0.97
In [125]:
#SVM on the pca data
svc_model_pca = SVC(C= 4, kernel='rbf', gamma='scale')
svc_model_pca.fit(X_tr, y_tr)
Out[125]:
SVC(C=4)
In [126]:
#predicting on train data
sv_tr_predict = svc_model_pca.predict(X_tr)
print("Model Accuracy on train: {0:.4f}".format(metrics.accuracy_score(y_tr, sv_tr_predict)))
print()
Model Accuracy on train: 0.9476

In [127]:
#predicting on test data
sv_te_predict = svc_model_pca.predict(X_te)
print("Model Accuracy on test: {0:.4f}".format(metrics.accuracy_score(y_te, sv_te_predict)))
print()
Model Accuracy on test: 0.9213

In [128]:
#visualization of confusion matrix in the form of a heatmap
cm= confusion_matrix(y_te, sv_te_predict)
plt.figure(figsize = (12, 8))
sns.heatmap(cm, annot = True, cmap = 'RdYlGn', fmt = 'd')
plt.xlabel('Predicted Classes', fontsize = 15)
plt.ylabel('Actual Classes', fontsize = 15)
plt.title('Confusion Matrix for SVM', fontsize = 15);
In [129]:
#printing classification report
print("Classification Report")
print(metrics.classification_report(y_te, sv_te_predict, labels=[0,1,2]))
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        71
           1       0.94      0.91      0.93       125
           2       0.84      0.91      0.88        58

    accuracy                           0.92       254
   macro avg       0.91      0.92      0.92       254
weighted avg       0.92      0.92      0.92       254

In [130]:
precision_SV_pca, recall_SV_pca, f1_score_SV_pca, support_pca = precision_recall_fscore_support(y_te, sv_te_predict,average='macro')
print('Precision Score :', '%0.2f' % precision_SV_pca)
print('Recall Score :', '%0.2f' % recall_SV_pca)
print('F1-Score:', '%0.2f' % f1_score_SV_pca)
SV_Acc_pca= accuracy_score(y_te, sv_te_predict)
print('Accuracy Score :','%0.2f' % SV_Acc_pca)
Precision Score : 0.91
Recall Score : 0.92
F1-Score: 0.92
Accuracy Score : 0.92
In [131]:
modellists = []
modellists.append(['Support Vector Classifier without PCA', SV_Acc * 100, recall_SV * 100, precision_SV * 100,f1_score_SV*100])
modellists.append(['Support Vector Classifier with PCA', SV_Acc_pca* 100, recall_SV_pca * 100, precision_SV_pca * 100,f1_score_SV_pca*100])
mdl_df = pd.DataFrame(modellists, columns = ['Model','Accuracy Score of Test Data', 'Recall Score', 'Precision Score','F1 Score'])
mdl_df
Out[131]:
Model Accuracy Score of Test Data Recall Score Precision Score F1 Score
0 Support Vector Classifier without PCA 96.850394 97.047758 96.227745 96.596702
1 Support Vector Classifier with PCA 92.125984 92.315169 91.352049 91.773898

Both models give more than 90% accuracy on the test data. The model with PCA used only 6 components to reach 90%+ accuracy, whereas the model without PCA used all 18 variables to reach a similar score. The advantage of PCA would be clearer on a dataset that genuinely suffers from the curse of dimensionality; with only 18 variables in the original data, the difference here is very subtle.
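The C=4 used for both SVMs was fixed by hand. One way to make the comparison more robust would be a small cross-validated grid search over the SVM hyperparameters; a hedged sketch, not something done in the original notebook (shown here for the non-PCA training data):

from sklearn.model_selection import GridSearchCV

# search a small grid of C and gamma with 5-fold cross-validation
param_grid = {'C': [0.1, 1, 4, 10], 'gamma': ['scale', 0.01, 0.1]}
grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print('Best parameters:', grid.best_params_)
print('Best CV accuracy: {0:.4f}'.format(grid.best_score_))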

In [ ]: